
*Filename: 1_individual-clean.do
*Created: 20201014
*Last edited: 20220413

/*Description: 

NOTE: needs the user-written command "renvars" (findit dm88_1)

This file pulls the desired PSID variables, and does some 
very minor variable cleaning/creation.

The input file and output file are in WIDE format. There is one observation
for each individual in the PSID data (restricted to SEO and SRC samples below)
and all of the information for that individual is included WIDE, with the survey
wave numbers appended to the variable names. 

Cleaning:
 Age/cohort variables 
 Hours variables 
 Schooling: 1-16, 17 for grad school 
 Incomes
	- create flags for: topcoded, bottom coded, negative family income
	- replace some wild code values with missing
	- CPI adjustment (to 2016 dollars)
 Use flags to get obs non-imputed values (specifically those not imputed by major assignment)
	This is actually done in separate do-file: accuracy-flags.do
	which is executed towards end of this file.
*/

/* The PSID waves/variables/years line up as follows	
		 (incomes/hours are reported for year prior to survey)											 
	1	1968		11	1978		21	1988		31	1999	41 2019
	2	1969		12	1979		22	1989		32	2001
	3	1970		13	1980		23	1990		33	2003
	4	1971		14	1981		24	1991		34	2005
	5	1972		15	1982		25	1992		35	2007
	6	1973		16	1983		26	1993		36	2009
	7	1974		17	1984		27	1994		37	2011
	8	1975		18	1985		28	1995		38  2013	
	9	1976		19	1986		29	1996		39  2015	
	10	1977		20	1987		30	1997		40	2017
*/

******************************************************* 
* Session setup: start from an empty dataset, turn off output paging, close
* any log that is still open (capture: no error when none is open), and set
* the maximum number of variables to 32000 before any data are loaded.
clear 
//version 16.0
set more off
capture log close 
set maxvar 32000

 
*Log file (path is quoted so that a project directory containing spaces works)
log using "${projdata}/individual-clean.log", replace



*Number of waves to use (wave 41 = 2019 survey; see the wave/year table above)
global NW=41

*Variables from raw data to use
* Each list pairs a one-character and a two-character wildcard so that the
* wave suffixes 1-9 and 10-99 are both matched.
* NOTE: the local lfstatus is referenced again in the final keep statement of
* this file, so its name and contents must stay as they are.
local incomes 	"earntothd? earntothd?? earntotwf? earntotwf?? earnbusin? earnbusin?? earnbuswfin? earnbuswfin?? faminc? faminc?? frmincin? frmincin??"
local incomeacc 	"acc*"
local occupschl   "schlast sch? sch?? "
local parentschl  "fathereduhdin? fathereduhdin?? mothereduhdin? mothereduhdin?? fathereduwfin? fathereduwfin?? mothereduwfin? mothereduwfin??"
local lfstatus    "hourshd? hourshd?? hourswf? hourswf??" // also spouse variables available for LFstatus
local idvars		"female src id? id?? seq? seq?? rel? rel?? head? head?? wife? wife?? iwgt? iwgt?? famwgt? famwgt?? er31996 er31997"
local agevars		"age? age?? birthyr? birthyr?? birthyr  agehd? agehd?? agewf? agewf??"


*Get raw data
* Only the variables listed above are read. The file path is quoted so that a
* raw-data directory containing spaces does not break the command.
clear
use `incomes' `incomeacc' `occupschl' `parentschl' `lfstatus' `idvars' `agevars' using "${rawdata}/psidmain_clean.dta"



*Rename "in" vars to simplify code
* (renvars is a user-written command; see the findit note in the file header)
renvars, subst(earnbusin earnbushd)
renvars, subst(earnbuswfin earnbuswf)
renvars, subst(frmincin farminc)

*Rename sampling (cluster and stratum) variables
rename er31997 cluster
rename er31996 stratum

*SEO sample indicator: 1 when the wave-1 id lies in 5000-6875, 0 otherwise
* (inrange() returns 0 for a missing id1, same as the two-sided comparison)
gen byte seo = inrange(id1, 5000, 6875)

*Keep only SRC and SEO samples
drop if src==0 & seo==0

* Unique Individual ID
* Stored as long: the default float type represents integers exactly only up
* to 16,777,216, which is too fragile for an identifier built as id*1000+seq.
  gen long newid=(id1*1000)+seq1


  
**************************************
*** AGE / COHORT variables

* Head and wife ages: a value of 0 or of 97 and above is set to missing.
forv w = 1/$NW {
	foreach who in hd wf {
		replace age`who'`w' = . if age`who'`w' >= 97 | age`who'`w' == 0
	}
}

* Own age and birth year: 0 and the all-nines code are set to missing.
forv w = 1/$NW {
	replace age`w'     = . if inlist(age`w', 0, 999)
	replace birthyr`w' = . if inlist(birthyr`w', 0, 9999)
}


* Cohort = survey year minus age, taken from the first wave with a reported age.
* Survey years are annual for waves 1-30 (1968-1997) and every second year
* from wave 31 onwards (1999, 2001, ...).
gen cohort = .
forv w = 1/$NW {
	local svyyear = cond(`w' <= 30, 1967 + `w', 1997 + 2*(`w' - 30))
	replace cohort = `svyyear' - age`w' if cohort == . & age`w' != .
	di "`svyyear' - wave `w'"
}



**************************************
***  SCHOOLING 

* schlast (last reported educational attainment) is created in m_psid.do.
* schmax is the row-wise maximum over all wave-specific schooling variables.
egen schmax = rowmax(sch? sch??)




**************************************
***  HOURS

* Annual hours of head and wife: the Latino sample value and wild codes
* (6730, 7800, 7980, 9999) are set to missing in every wave.
forv w = 1/$NW {
	foreach who in hd wf {
		replace hours`who'`w' = . if inlist(hours`who'`w', 6730, 7800, 7980, 9999)
	}
}


**************************************
***  ANNUAL INCOME MEASURES



** LABOR INCOME

* Create empty labor income variables for head and wife in every wave.
* Terminology: PSID switched to Reference Person instead of Head in 2017 and
* to Spouse/Partner instead of Wife in 2015; the old names are kept here.
forv w = 1/$NW {
	foreach who in hd wf {
		qui gen inc`who'`w' = .
	}
	label var inchd`w' "HEAD Labor income"
	label var incwf`w' "WIFE Labor income"
}


* 1994-1995 surveys (waves 27-28): the all-nines value marks a Latino sample
* family rather than a dollar amount, so it is set to missing.
forv w = 27/28 {
	* six-digit code: farm and business income
	foreach v in farminc earnbushd earnbuswf {
		replace `v'`w' = . if `v'`w' == 999999
	}
	* seven-digit code: total labor earnings
	foreach v in earntothd earntotwf {
		replace `v'`w' = . if `v'`w' == 9999999
	}
}


* Fill the labor income variables from total labor earnings, all waves.
forv w = 1/$NW {
	foreach who in hd wf {
		replace inc`who'`w' = earntot`who'`w'
	}
}
* NOTE: 4b_analysis-sample.do has an option to add farm income and the labor
*       part of business income for 1993-2018 incomes (1994-2019 surveys).
*       The labor part of farm and business income was included in the total
*       labor earnings variables (earntot) in the 1968-1993 surveys, but then
*       excluded for the 1994-2019 surveys.





** FAMILY INCOME (includes labor, asset, and transfer income)

* Create an empty, labelled family income variable for every wave.
forv w = 1/$NW {
	qui gen fminc`w' = .
	label var fminc`w' "Family income"
}

* 1994-1995 surveys (waves 27-28): 9999999 marks a Latino sample family,
* not a dollar amount, so it is set to missing.
foreach w of numlist 27 28 {
	replace faminc`w' = . if faminc`w' == 9999999
}

* Copy the cleaned raw family income into the new variables.
forv w = 1/$NW {
	replace fminc`w' = faminc`w'
}






************************************** 
*** FLAGS FOR TOP/BOTTOM CODED VALUES

* Top-code flags for LABOR income of head (hd) and wife (wf).
* A flag equals 1 when raw total labor earnings equal the top-code value of
* that wave and is missing otherwise.
forv w = 1/$NW {
	gen F_inchd_top`w' = .
	gen F_incwf_top`w' = .
}

* Top-code values by survey wave:
*   waves  1-15 (1968-1982 surveys): 99999 for both
*   wave   16   (1983 survey)      : 999999 head, 99999 wife
*   waves 17-25 (1984-1992 surveys): 999999 for both
*   waves 26-29 (1993-1996 surveys): 9999999 for both
*   wave   30   (1997 survey)      : 999999 for both
*   wave   31   (1999 survey)      : 9999999 head, 9999997 wife
*   waves 32+   (2001-2019 surveys): 9999999 for both
* NOTE(review): the wife codes in waves 16 and 31 differ from the head codes;
*   confirm against the PSID codebooks.
* NOTE(review): in waves 27-28 the value 9999999 of earntothd and earntotwf is
*   set to missing earlier in this file, so these flags cannot be set there.
forv w = 1/$NW {
	local hdcode 9999999
	local wfcode 9999999
	if `w' <= 15 {
		local hdcode 99999
		local wfcode 99999
	}
	else if `w' == 16 {
		local hdcode 999999
		local wfcode 99999
	}
	else if inrange(`w', 17, 25) | `w' == 30 {
		local hdcode 999999
		local wfcode 999999
	}
	else if `w' == 31 {
		local wfcode 9999997
	}
	replace F_inchd_top`w' = 1 if earntothd`w' == `hdcode'
	replace F_incwf_top`w' = 1 if earntotwf`w' == `wfcode'
}



* Top- and bottom-code flags for FARM income.
* Flags are created for every wave but only set where a code is listed below.
forv w = 1/$NW {
	gen F_farminc_top`w' = .
	gen F_farminc_bot`w' = .
}

* Top code, 1999-2009 surveys (waves 31-36)
forv w = 31/36 {
	replace F_farminc_top`w' = 1 if farminc`w' == 9999999
}

* Bottom code: -9999 in the 1995-1996 surveys (waves 28-29) and -99999 in the
* 1994, 1997-2005 and 2009 surveys (waves 27, 30-34, 36). Wave 35 has none.
foreach w of numlist 27/34 36 {
	local botcode = cond(inlist(`w', 28, 29), -9999, -99999)
	replace F_farminc_bot`w' = 1 if farminc`w' == `botcode'
}


* Top- and bottom-code flags for FAMILY income.
* Flags are created for every wave; waves without a listed code keep missing.
forv w = 1/$NW {
	gen F_fminc_top`w' = .
	gen F_fminc_bot`w' = .
}

* Codes by survey wave (taken from the codebook notes of the original file):
*   wave   1    (1968)     : none (0=None, 1-82,075 actual amount)
*   waves  2-4  (1969-1971): top 99999, no bottom code
*                            (1969: 0=Zero money income; 1970: 0=No money
*                             income; 1971: -750 to -1 = actual loss and
*                             0=No family money income, both with 0 obs)
*   waves  5-12 (1972-1979): top 99999,   bottom 1
*   wave   13   (1980)     : top 999999,  bottom 1
*   waves 14-16 (1981-1983): top 9999999, bottom 1
*   waves 17-18 (1984-1985): top 999999,  bottom 1
*   waves 19-26 (1986-1993): top 9999999, bottom 1
*   waves 27-29 (1994-1996): top 9999999, bottom -999999
*   wave   30   (1997)     : none listed in codebook, all actual values
*   waves 31-33 (1999-2003): top 9999999, no bottom code
*   waves 34-36 (2005-2009): top 9999999, bottom -999999
*   waves 37+   (2011-2019): none listed in codebook, all actual values
* NOTE(review): the 7-digit top code for 1981-1983 sits between 6-digit codes
*   in 1980 and 1984-1985; confirm against the PSID codebooks.
forv w = 2/36 {
	if `w' != 30 {
		* top code
		local topcode 9999999
		if `w' <= 12 {
			local topcode 99999
		}
		else if inlist(`w', 13, 17, 18) {
			local topcode 999999
		}
		* bottom code (left empty when the wave has none)
		local botcode
		if inrange(`w', 5, 26) {
			local botcode 1
		}
		else if inrange(`w', 27, 29) | inrange(`w', 34, 36) {
			local botcode -999999
		}
		replace F_fminc_top`w' = 1 if faminc`w' == `topcode'
		if "`botcode'" != "" {
			replace F_fminc_bot`w' = 1 if faminc`w' == `botcode'
		}
	}
}



**************************************
*** ADJUST FOR INFLATION									

* Income variables that get a real-valued counterpart with prefix r.
local nominal "inchd incwf fminc earnbushd earnbuswf farminc"

* CPI for the income year of each wave, in wave order. Incomes refer to the
* year before the survey: waves 1-30 cover income years 1967-1996, and the
* biennial waves 31-41 cover income years 1998, 2000, ..., 2018.
local cpi  33.4  34.8  36.7  38.8  40.5  41.8  44.4  49.3  53.8  56.9 ///
           60.6  65.2  72.6  82.4  90.9  96.5  99.6 103.9 107.6 109.6 ///
          113.6 118.3 124   130.7 136.2 140.3 144.5 148.2 152.4 156.9 ///
          163   172.2 179.9 188.9 201.6 215.30 218.06 229.59 236.74 240.01 251.11

* Step 1: divide each nominal income by the CPI of its income year.
* capture noisily lets the run continue (while still showing the message)
* when a replace fails for some variable and wave.
foreach v of local nominal {
	forv w = 1/$NW {
		gen r`v'`w' = .
	}
	forv w = 1/41 {
		local index : word `w' of `cpi'
		cap noi replace r`v'`w' = `v'`w' / `index'
	}
}


* Step 2: multiply by the 2016 CPI so that amounts are in 2016 dollars
* (to match the Swedish data). Kept as a separate pass over the stored values.
local cpi2016 240.01
foreach v of local nominal {
	forv w = 1/$NW {
		replace r`v'`w' = r`v'`w' * `cpi2016'
	}
}





**************************** 
*** FLAG NON-IMPUTED VALUES

* Run the separate accuracy-flags do-file on the data in memory. The path is
* quoted so that a code directory containing spaces does not break the call.
do "${code}/PSIDprep/accuracy-flags.do"

* Summarize the S_ variables, which are expected to exist after the call
* above, one group at a time as a quick check of the result.
foreach grp in inchd incwf businc frminc nonlabinc {
	su S_`grp'*

}





**************************** 
*** KEEP VARS AND SAVE

*Keep only desired variables
* The locals below redefine the lists used when loading the raw data so that
* they now name the cleaned and newly created variables.
loc incomes 	"inchd? inchd?? incwf? incwf?? fminc? fminc??  rinchd? rinchd?? rincwf? rincwf?? rfminc? rfminc?? earnbushd? earnbushd?? earnbuswf? earnbuswf?? rearnbushd? rearnbushd?? rearnbuswf? rearnbuswf?? farminc? farminc?? rfarminc? rfarminc??"
loc idvars		"newid id? id?? seq? seq?? rel? rel?? head? head?? wife? wife?? iwgt? iwgt?? famwgt? famwgt?? female src cluster stratum seo"
loc incomeacc 	"S_* F_*"
loc occupschl   "schlast schmax sch? sch??"
loc agevars		"age? age?? birthyr? birthyr?? birthyr agehd? agehd?? agewf? agewf?? cohort"
loc othvars		"hourshd? hourshd?? hourswf? hourswf??"

* lfstatus is still the list defined near the top of the file; it names the
* same hours variables as othvars. The parental schooling variables that were
* loaded are not in any list here and are therefore dropped.
keep `incomes' `incomeacc' `occupschl' `idvars' `agevars' `lfstatus' `othvars'


*SAVE cleaned individual-level file (wide format)
* Path is quoted so that a project directory containing spaces works.
qui compress
save "${projdata}/individual-clean.dta", replace




clear
log close

*End 1_individual-clean.do*
